---
title: "G1000 Data Analysis"
author: "Klara Pigmans & Tim van Erven"
output: pdf_document
---

```{r echo=FALSE}
path <- "~/G1000 data/"

histcol <- "#4f81bd"
hist_cex_lab <- 1.4
hist_cex_axis <- 1.2

topic_groups <- vector("list",5)
topic_groups[[1]] <- c(2, 3, 6, 9, 14, 17, 18, 20, 21, 24, 27, 28)
topic_groups[[2]] <- c(32, 34, 35, 36, 39, 44, 45, 48, 51,53, 55)
topic_groups[[3]] <- c(64, 65, 67, 68, 71, 78, 79, 81, 85, 88)
topic_groups[[4]] <- c(92, 93, 95, 99, 101, 103,105, 106, 107, 112, 113, 115)
topic_groups[[5]] <- c(100, 122, 123, 124, 125, 126, 129, 131, 132, 134, 139,140, 144, 145, 146, 147)
```

Report produced: `r format(Sys.time(), '%d %B, %Y')`

#Loading Data

Data directory = `r path`

```{r echo=FALSE, message=FALSE}
library(ConsRank)
library(knitr)
opts_chunk$set(tidy=TRUE, tidy.opts=list(width.cutoff=60))
opts_chunk$set(echo=FALSE)
opts_chunk$set(linewidth=80)
opts_chunk$set(comment="")
hook_output = knit_hooks$get('output')
knit_hooks$set(output = function(x, options) {
 # this hook is used only when the linewidth option is not NULL
 if (!is.null(n <- options$linewidth)) {
   x = knitr:::split_lines(x)
   # any lines wider than n should be wrapped
   if (any(nchar(x) > n)) x = strwrap(x, width = n)
   x = paste(x, collapse = '\n')
 }
 hook_output(x, options)
})
```

```{r}
#### Load G1000 Data ####

# Read data file for group with careful input format checking
read_group_data <- function(group) {
  filename <- paste(path, "tafel", group, " - 75.csv", sep="")
  raw_data <- read.csv(filename, header=FALSE, colClasses="character")
 
  if (raw_data[2,1] != "Tafel nummer: ") {
    stop("Entry in field [2,1] of ", filename, " does not follow expected format")
  }
  if (is.integer(raw_data[2,3]) | as.integer(raw_data[2,3]) != group) {
    stop("Group number '", raw_data[2,3], "' in input file does not match expected group number '", group, "'")
  }
  for (i in 1:4) {
    if (raw_data[4,i*2] != paste("Alternatief", c('A', 'B', 'C', 'D')[i])) {
      stop("Line 4 of input file ", filename, "not as expected")
    }
    if (raw_data[5,i*2] != "Ranking 1" | raw_data[5,i*2+1] != "Ranking 2") {
      stop("Line 5 of input file ", filename, "not as expected")
    }
  }
  for (i in 1:12) {
    if (raw_data[i+5,1] != paste("Deelnemer", i)) {
      stop("Entry [", i+5, ",1] of input file not as expected")
    }
  }
  if (raw_data[18,1] != "Totaal") {
    stop("Entry [18,1] of input file not as expected")
  }
  
  
  npeople <- 0
  before <- matrix(nrow = 0, ncol = 4)
  after <- matrix(nrow = 0, ncol = 4)
  for (i in 1:12) {
    # Skip line if empty
    if (raw_data[i+5,2] == "") {
      for (j in 3:9) {
        if (raw_data[i+5,j] != "") {
          stop("Entry [", i+5, ",", j, "] of input file not empty as expected")
        }
      }
      next
    }
    
    # Read line
    npeople <- npeople + 1
    before <- rbind(before, rep(NA,4))
    after <- rbind(after, rep(NA,4))
    for (j in 1:4) {
      x <- raw_data[i+5,j*2]
      if (!x %in% c('1', '2', '3', '4')) {
        stop("Data item in field [", i+5, ",", j*2, "] not as expected (should be 1, 2, 3 or 4)")
      }
      before[npeople,j] <- as.integer(x) 
      
      y <- raw_data[i+5,j*2+1]
      if (!y %in% c('1', '2', '3', '4')) {
        stop("Data item in field [", i+5, ",", j*2+1, "] not as expected (should be 1, 2, 3 or 4)")
      }
      after[npeople,j] <- as.integer(y)
    }

    if (any(sort(before[npeople,]) != 1:4) |
        any(sort(after[npeople,]) != 1:4)) {
        stop("Invalid ranking for Deelnemer ", i)
    }
  }
 
  for (j in 1:4) {
    if (sum(before[,j]) != as.integer(raw_data[18,j*2])) {
      stop("Total in column ", j*2, " not as expected")
    }
    if (sum(after[,j]) != as.integer(raw_data[18,j*2+1])) {
      stop("Total in column ", j*2+1, " not as expected")
    }
  }
   
  return(list(before=before, after=after, npeople=npeople))
}

filelist <- list.files(path, pattern='tafel[0-9]+ - 75.csv')
groups <- as.integer(sub('tafel([0-9]+) - 75.csv', '\\1', filelist))
groups <- sort(groups)

#expected_groups <- c(2, 3, 6, 9, 14, 17, 18, 20, 21, 24, 27, 28, 32, 34, 35, 36, 39, 44, 45, 48, 51, 53, 55, 64, 65, 67, 68, 71, 78, 79, 81, 85, 88, 92, 93, 95, 99, 100, 101, 103, 105, 106, 107, 112, 113, 115, 122, 123, 124, 125, 126, 129, 131, 132, 134, 139, 140, 144, 145, 146, 147)
expected_groups <- sort(unlist(topic_groups))

if (length(groups) != length(expected_groups) | !all(groups == expected_groups)) {
  stop("Found groups do not match expected groups")
}

ngroups <- length(groups)
ranks_before <- vector("list", ngroups)
ranks_after <- vector("list", ngroups)
npeople <- numeric(ngroups)

cat("Reading groups: ")
for (i in 1:ngroups) {
  g <- groups[i]
  cat(g, " ", sep="")
  gdata <- read_group_data(g)
  ranks_before[[i]] <- gdata$before
  ranks_after[[i]] <- gdata$after
  npeople[i] <- gdata$npeople
}
```




#Initial Analysis

```{r}
#### Analyse G1000 Data ####

# Given a matrix in which the rows represent rankings in the format explained in rankexplain.R,
# compute the average Kemeny-Snell distance
avg_KemenyRankCorrelation <- function(X) {
  # Find consensus rankings using Emond and Mason's alg
  consensus <- EMCons(X, PS=FALSE)
  
  # Return avg tauX for first consensus ranking
  return(consensus$Tau[1])
  
  # Convert avg tauX for first consensus ranking to avg Kemeny-Snell distance
  #n <- ncol(X) # nr of choices to be ranked
  #return(0.5*n*(n-1) * (1 - consensus$Tau[1]))
}

# Compute group proximity for each group (before and after discussion)
avg_cor_before <- numeric(ngroups)
avg_cor_after <- numeric(ngroups)
for (group in 1:ngroups) {
  avg_cor_before[group] <- avg_KemenyRankCorrelation(ranks_before[[group]])
  avg_cor_after[group] <- avg_KemenyRankCorrelation(ranks_after[[group]])
}

# Difference of 'after' minus 'before'
avg_cor_diff <- avg_cor_after - avg_cor_before

par(cex.lab = hist_cex_lab, cex.axis=hist_cex_axis)
hist(avg_cor_before, main='Avg KS-Rank Correlation Before Deliberation', breaks=seq(0,1,0.25/6), ylim=c(0,12), ylab="Nr. of Groups", col=histcol)
cat("\n\n")
hist(avg_cor_after, main='Avg KS-Rank Correlation After Deliberation', breaks=seq(0,1,0.25/6),ylim=c(0,12), ylab="Nr. of Groups", col=histcol)
cat("\n\n")

hist(avg_cor_diff, main='Avg KS-Correlations Difference: After-Before Deliberation', breaks=seq(-3.25,3.25,0.25)/6, ylab="Nr. of Groups", col=histcol)
```

```{r}
# Print numeric summery of the data
cat("\n\nNumber of groups:", ngroups, "\n")
cat("Overview statistics of number of people in groups:\n")
print(summary(npeople))
cat("\nSummary of the average KS-Correlations Difference:\n")
print(summary(avg_cor_diff))
cat("Standard deviation:", sd(avg_cor_diff), "\n")
```


```{r}
cat("Avg group proximity:\n")
cat("Before: ", mean(avg_cor_before), "    After: ", mean(avg_cor_after),"\n")
```



## Analysis by Topic

```{r}
par(cex.lab = hist_cex_lab, cex.axis=hist_cex_axis)
topic_stats <- data.frame(row.names = c('Avg agreement before', 'Avg diff'))
extreme_groups <- data.frame(row.names = c('Group with smallest difference', 'Group with largest difference'))
extreme_group_values <- data.frame(row.names = c('Smallest difference', 'Largest difference'))
for (i in 1:length(topic_groups)) {
  topic_groups[[i]] <- sort(topic_groups[[i]])
  cat(length(topic_groups[[i]]), ' groups with topic ', i, ': ', sep="")
  cat(topic_groups[[i]], '\n', sep=" ")

  topic <- groups %in% topic_groups[[i]]
  if (sum(topic) != length(topic_groups[[i]])) {
    stop('Problem splitting into topics')
  }
  
  topic_avg_cor_before <- avg_cor_before[topic]
  topic_avg_cor_after <- avg_cor_after[topic]
  topic_avg_cor_diff <- avg_cor_diff[topic]
  if (!all(topic_avg_cor_diff == topic_avg_cor_after - topic_avg_cor_before)) {
    stop('Topic sanity check failed')
  }
  
  topic_stats[[i]] <- c(mean(topic_avg_cor_before), mean(topic_avg_cor_diff))
  
  best_group <- topic_groups[[i]][which.max(topic_avg_cor_diff)]
  best_group_value <- max(topic_avg_cor_diff)
  worst_group <- topic_groups[[i]][which.min(topic_avg_cor_diff)]
  worst_group_value <- min(topic_avg_cor_diff)
  
  if (avg_cor_diff[groups == best_group] != best_group_value |
      avg_cor_diff[groups == worst_group] != worst_group_value) {
    stop("Best/worst group sanity check failed")
  }
  
  extreme_groups[[i]] <- c(worst_group, best_group)
  extreme_group_values[[i]] <- c(worst_group_value, best_group_value)
  
  hist(topic_avg_cor_diff, main=paste('group proximity Difference for topic', i, ': After-Before Deliberation'), breaks=seq(-3.25,3.25,0.25)/6,ylim=c(0,6), ylab="Nr. of Groups", col=histcol)
}
colnames(topic_stats) <- paste("Topic", 1:length(topic_groups))
topic_stats_sorted <- topic_stats[,order(topic_stats[1,])]
kable(topic_stats_sorted, caption="For each topic: the avg group proximity before deliberation, and the avg difference in group proximity between deliberations. (I have sorted topics by avg group proximity before deliberation.)", digits=2)

colnames(extreme_groups) <- paste("Topic", 1:length(topic_groups))
kable(extreme_groups, caption="For each topic: the groups with the smallest and the largest difference in group proximity. (NB. I have not sorted the topics this time.)")

colnames(extreme_group_values) <- paste("Topic", 1:length(topic_groups))
kable(extreme_group_values, caption="The differences corresponding to the groups reported in the previous table.", digits=2)
```



## Splitting by Group Size

```{r}
barplot(table(npeople), main="Nr of people per group")
cat(" \n\n\n\n")
print(data.frame(table(npeople)))
#cat(" \n")

smallgroupmaxsize <- 5
largegroupminsize <- 7

smallgroups <- npeople <= smallgroupmaxsize
small_avg_cor_before <- avg_cor_before[smallgroups]
small_avg_cor_after <- avg_cor_after[smallgroups]
small_avg_cor_diff <- avg_cor_diff[smallgroups]

largegroups <- npeople >= largegroupminsize
large_avg_cor_before <- avg_cor_before[largegroups]
large_avg_cor_after <- avg_cor_after[largegroups]
large_avg_cor_diff <- avg_cor_diff[largegroups]
```

## All group proximities

```{r}
all_agreements <- data.frame(Group = groups, AgreementBefore = avg_cor_before, AgreementAfter = avg_cor_after, AgreementDifference = avg_cor_diff, NrOfPeople=npeople)
kable(all_agreements, caption="All group proximities, sorted by group", digits=4)
#write.csv(all_agreements, 'GroupAgreementsByGroup.csv', row.names=FALSE)

all_agreements <- all_agreements[order(avg_cor_before),]
kable(all_agreements, caption="All group proximities, sorted by group proximity before deliberation", digits=4, row.names=FALSE)
#write.csv(all_agreements, 'GroupAgreementsByAgreementBefore.csv', row.names=FALSE)

all_agreements <- all_agreements[order(all_agreements$NrOfPeople),]
kable(all_agreements, caption="All group proximities, sorted by nr. of people", digits=4, row.names=FALSE)
#write.csv(all_agreements, 'GroupAgreementsByNrOfPeople.csv', row.names=FALSE)
```
